###############################################################################
###############   Appendix Tables  S-6 to S-49 and Figure S-4   ###############
###############################################################################

# All MTI validation tables and other figures in the appendix
# can be replicated via the STATA do file for Appendix

library(data.table)
library(dtplyr)
library(tidyverse)
library(fixest)
library(readstata13)
library(MatchIt)
library(CBPS)
library(modelsummary)

setwd("YOUR WORKING DIRECTORY")


################################################################################
###                     Patent-level regressions
################################################################################

### LOAD IN DATA FOR ANALYSIS
read_stata <- function(x) read.dta13(x) %>% as_tibble()

# Counts of number female/male authors on a patent -- used to create if a 
# team is majority female or majority male
inventor_gender_counts <- read_stata("clean_data/patent_data/inventor_gender_counts.dta")

# Data generated using 2019 MeSH terms and 2,500 characters from patent
patent_gbd <- read_stata("clean_data/patent_data/patent_gbd_level.dta")
patent_mesh_raw <- read_stata("clean_data/patent_data/patent_mesh_raw.dta")
patent_mesh_and_tree <- read_stata("clean_data/patent_data/patent_mesh_and_tree.dta")

# Patent citation/input data
patent_inputs <- read_stata("clean_data/patent_data/patent_gbd_with_paper_pair_variables.dta")

# Data gemerated isomg 2020 MeSH terms and 10k characters from patent
patent_gbd_10k <- read_stata("clean_data/patent_data/patent_gbd_level_tas_10000.dta")
patent_mesh_raw_10k <- read_stata("clean_data/patent_data/patent_mesh_raw_tas_10000.dta")
patent_mesh_and_tree_10k <- read_stata("clean_data/patent_data/patent_mesh_and_tree_tas_10000.dta")

patent_gbd_ta <- read_stata("clean_data/patent_data/patent_gbd_level_ta_2500.dta")
patent_mesh_raw_ta <- read_stata("clean_data/patent_data/patent_mesh_raw_ta_2500.dta")
patent_mesh_and_tree_ta <- read_stata("clean_data/patent_data/patent_mesh_and_tree_ta_2500.dta")

patent_gbd_strict <- read_stata("clean_data/patent_data/patent_gbd_level_tas_2500 strict.dta")
patent_mesh_raw_strict <- read_stata("clean_data/patent_data/patent_mesh_raw_tas_2500 strict.dta")
patent_mesh_and_tree_strict <- read_stata("clean_data/patent_data/patent_mesh_and_tree_tas_2500 strict.dta")

## merge the GBD data with gender counts data (this is the main dataset)

pat_reg_data <- 
  patent_gbd %>% 
  left_join(inventor_gender_counts) %>%
  group_by(patent_id) %>% 
  filter(row_number() == 1) %>% 
  ungroup() %>% 
  mutate(
    min_f = (female_count > 0) & (male_count > female_count),
    maj_f = (female_count >= male_count) & (male_count > 0),
    all_f = (female_count > 0) & (male_count == 0),
    all_male = female_count == 0 & male_count > 0,
    pf100 = 100*patent_female,
    pm100 = 100*patent_male, 
    yr = patent_year,
    sc = subcategory_id,
    sz = pat_team_sz
  ) %>% 
  filter(
    !is.na(female_count), !is.na(male_count)
  ) %>% 
  mutate(
    asg_company = ifelse(is.na(asg_company), 0, asg_company)
  )

################     S-47     ####################

datasummary((`Team Size` = sz) + (`Year` = yr) + 
              (`Female Mesh` = patent_female) + (`Male Mesh` = patent_male) + 
              (`Minority Female Team` = min_f) + (`Majority Female Team` = maj_f) + 
              (`All Female Team` = all_f) ~ Mean + Median + (Std.Dev. = SD) + Max + 
              Min + (N = 1), data = pat_reg_data %>% 
              mutate(
                min_f = as.numeric(min_f), 
                maj_f = as.numeric(maj_f), 
                all_f = as.numeric(all_f)
              ))

################     S-48     ####################

datasummary((`Team Size` = sz) + (`Year` = yr) + 
              (`Female Mesh` = patent_female) + (`Male Mesh` = patent_male) + 
              (`Minority Female Team` = min_f) + (`Majority Female Team` = maj_f) + 
              (`All Female Team` = all_f) ~ Mean + Median + (Std.Dev. = SD) + Max + 
              Min + (N = 1), data = pat_reg_data %>% 
              mutate(
                min_f = as.numeric(min_f), 
                maj_f = as.numeric(maj_f), 
                all_f = as.numeric(all_f)
              ) %>% 
              filter(yr > 2001))

## Results hold no matter how you do all_female 
m_all <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
               pat_reg_data)

m_decade1 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                   pat_reg_data %>% filter(patent_year < 1990))

m_decade2 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                   pat_reg_data %>% filter(patent_year > 1989 & patent_year < 2000))

m_decade3 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                   pat_reg_data %>% filter(patent_year > 1999))

m_drugs <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                 pat_reg_data %>% filter(subcategory_id == 31))

m_surgery <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                   pat_reg_data %>% filter(subcategory_id == 32))

m_uni <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
               pat_reg_data %>% filter(asg_company != 1))

m_corp <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                pat_reg_data %>% filter(asg_company == 1))

####################  S-13   ###################

etable(m_all, m_decade1, m_decade2, m_decade3, m_drugs, m_surgery, m_uni, m_corp,
       cluster="patent_id")

## male version replication

male_m_all <- feols(pm100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                    pat_reg_data)

male_m_decade1 <- feols(pm100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                        pat_reg_data %>% filter(patent_year < 1990))

male_m_decade2 <- feols(pm100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                        pat_reg_data %>% filter(patent_year > 1989 & patent_year < 2000))

male_m_decade3 <- feols(pm100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                        pat_reg_data %>% filter(patent_year > 1999))

male_m_drugs <- feols(pm100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                      pat_reg_data %>% filter(subcategory_id == 31))

male_m_surgery <- feols(pm100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                        pat_reg_data %>% filter(subcategory_id == 32))

male_m_uni <- feols(pm100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                    pat_reg_data %>% filter(asg_company != 1))

male_m_corp <- feols(pm100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                     pat_reg_data %>% filter(asg_company == 1))

####################  S-17   ###################

etable(male_m_all, male_m_decade1, male_m_decade2, male_m_decade3, male_m_drugs, 
       male_m_surgery, male_m_uni, m_corp, cluster="patent_id")


################################################################################
###                     MATCHING ANALYSIS: EXACT MATCHING 
################################################################################

# Construct the Disease Area MeSH term Fixed Effect Variable
pat_disease_fe <- 
  patent_mesh_and_tree %>%  
  # Only keep the disease area MeSH tree numbers with at least 2-level depth
  filter(mesh_tree_num != "", str_sub(mesh_tree_num,1,1) == "C", str_count(mesh_tree_num, "\\.") >  2) %>% 
  group_by(patent_id, mesh_term) %>% 
  slice_head(n=1) %>% 
  ungroup() %>% 
  mutate(
    # convert the tree number to unique integers
    mesh_tree = as.numeric(as.factor(mesh_term)),
    mesh_rank = patent_mesh_rank
  ) %>% 
  dplyr::select(patent_id, mesh_tree, mesh_rank) %>% 
  arrange(patent_id, mesh_rank) %>% 
  group_by(patent_id) %>% 
  # keep the top 10 disease area tree number related to an article
  slice_head(n=10) %>% 
  mutate(m = row_number(mesh_rank)) %>% 
  ungroup() %>% 
  arrange(patent_id, mesh_rank) %>% 
  dplyr::select(patent_id, mesh_tree, m) %>% 
  spread(m, mesh_tree, sep="_", fill=0) 


# Final dataset used for exact matchings
pat_mesh_reg_data <- 
  pat_reg_data %>% 
  left_join(pat_disease_fe) %>% 
  filter(m_1 != 0, !is.na(m_1))


#Year-Subcategory-Team Size-Disease Area Matching
match_teams <- function(x, data) {
  matchme <- 
    data %>% 
    mutate(treat = !!sym(x)) %>% 
    filter(all_male | !!sym(x), !is.na(m_1)) %>% 
    dplyr::select(patent_id, pf100, pm100, treat, !!sym(x), yr, sc, sz, m_1)
  
  mout <- matchit(treat ~ yr + sc + m_1 + sz, 
                  data=matchme, 
                  method="exact")
  
  mdata <- match.data(mout) %>% as_tibble()
  mdata
}


### Matching process

dm_all_all_f <- match_teams('all_f', pat_mesh_reg_data)
dm_decade1_all_f <- match_teams('all_f', pat_mesh_reg_data %>% filter(patent_year < 1990))
dm_decade2_all_f <- match_teams('all_f', pat_mesh_reg_data %>% filter(patent_year > 1989 & patent_year < 2000))
dm_decade3_all_f <- match_teams('all_f', pat_mesh_reg_data %>% filter(patent_year > 1999))
dm_drugs_all_f <- match_teams('all_f', pat_mesh_reg_data %>% filter(subcategory_id == 31))
dm_surgery_all_f <- match_teams('all_f', pat_mesh_reg_data %>% filter(subcategory_id == 32))
dm_uni_all_f <- match_teams('all_f', pat_mesh_reg_data %>% filter(asg_company != 1))
dm_corp_all_f <- match_teams('all_f', pat_mesh_reg_data %>% filter(asg_company == 1))

dm_all_maj_f <- match_teams('maj_f', pat_mesh_reg_data)
dm_decade1_maj_f <- match_teams('maj_f', pat_mesh_reg_data %>% filter(patent_year < 1990))
dm_decade2_maj_f <- match_teams('maj_f', pat_mesh_reg_data %>% filter(patent_year > 1989 & patent_year < 2000))
dm_decade3_maj_f <- match_teams('maj_f', pat_mesh_reg_data %>% filter(patent_year > 1999))
dm_drugs_maj_f <- match_teams('maj_f', pat_mesh_reg_data %>% filter(subcategory_id == 31))
dm_surgery_maj_f <- match_teams('maj_f', pat_mesh_reg_data %>% filter(subcategory_id == 32))
dm_uni_maj_f <- match_teams('maj_f', pat_mesh_reg_data %>% filter(asg_company != 1))
dm_corp_maj_f <- match_teams('maj_f', pat_mesh_reg_data %>% filter(asg_company == 1))

dm_all_min_f <- match_teams('min_f', pat_mesh_reg_data)
dm_decade1_min_f <- match_teams('min_f', pat_mesh_reg_data %>% filter(patent_year < 1990))
dm_decade2_min_f <- match_teams('min_f', pat_mesh_reg_data %>% filter(patent_year > 1989 & patent_year < 2000))
dm_decade3_min_f <- match_teams('min_f', pat_mesh_reg_data %>% filter(patent_year < 1999))
dm_drugs_min_f <- match_teams('min_f', pat_mesh_reg_data %>% filter(subcategory_id == 31))
dm_surgery_min_f <- match_teams('min_f', pat_mesh_reg_data %>% filter(subcategory_id == 32))
dm_uni_min_f <- match_teams('min_f', pat_mesh_reg_data %>% filter(asg_company != 1))
dm_corp_min_f <- match_teams('min_f', pat_mesh_reg_data %>% filter(asg_company == 1))

### Matching regressions with female patent as DV

mm_all_all_f <- feols(pf100 ~ all_f | subclass, dm_all_all_f, weights = ~weights)
mm_decade1_all_f <- feols(pf100 ~ all_f | subclass, dm_decade1_all_f, weights = ~weights)
mm_decade2_all_f <- feols(pf100 ~ all_f | subclass, dm_decade2_all_f, weights = ~weights)
mm_decade3_all_f <- feols(pf100 ~ all_f | subclass, dm_decade3_all_f, weights = ~weights)
mm_drugs_all_f <- feols(pf100 ~ all_f | subclass, dm_drugs_all_f, weights = ~weights)
mm_surgery_all_f <- feols(pf100 ~ all_f | subclass, dm_surgery_all_f, weights = ~weights)
mm_uni_all_f <- feols(pf100 ~ all_f | subclass, dm_uni_all_f, weights = ~weights)
mm_corp_all_f <- feols(pf100 ~ all_f | subclass, dm_corp_all_f, weights = ~weights)

mm_all_maj_f <- feols(pf100 ~ maj_f | subclass, dm_all_maj_f, weights = ~weights)
mm_decade1_maj_f <- feols(pf100 ~ maj_f | subclass, dm_decade1_maj_f, weights = ~weights)
mm_decade2_maj_f <- feols(pf100 ~ maj_f | subclass, dm_decade2_maj_f, weights = ~weights)
mm_decade3_maj_f <- feols(pf100 ~ maj_f | subclass, dm_decade3_maj_f, weights = ~weights)
mm_drugs_maj_f <- feols(pf100 ~ maj_f | subclass, dm_drugs_maj_f, weights = ~weights)
mm_surgery_maj_f <- feols(pf100 ~ maj_f | subclass, dm_surgery_maj_f, weights = ~weights)
mm_uni_maj_f <- feols(pf100 ~ maj_f | subclass, dm_uni_maj_f, weights = ~weights)
mm_corp_maj_f <- feols(pf100 ~ maj_f | subclass, dm_corp_maj_f, weights = ~weights)

mm_all_min_f <- feols(pf100 ~ min_f | subclass, dm_all_min_f, weights = ~weights)
mm_decade1_min_f <- feols(pf100 ~ min_f | subclass, dm_decade1_min_f, weights = ~weights)
mm_decade2_min_f <- feols(pf100 ~ min_f | subclass, dm_decade2_min_f, weights = ~weights)
mm_decade3_min_f <- feols(pf100 ~ min_f | subclass, dm_decade3_min_f, weights = ~weights)
mm_drugs_min_f <- feols(pf100 ~ min_f | subclass, dm_drugs_min_f, weights = ~weights)
mm_surgery_min_f <- feols(pf100 ~ min_f | subclass, dm_surgery_min_f, weights = ~weights)
mm_uni_min_f <- feols(pf100 ~ min_f | subclass, dm_uni_min_f, weights = ~weights)
mm_corp_min_f <- feols(pf100 ~ min_f | subclass, dm_corp_min_f, weights = ~weights)


### Male version replication regressions

male_mm_all_all_f <- feols(pm100 ~ all_f | subclass, dm_all_all_f, weights = ~weights)
male_mm_decade1_all_f <- feols(pm100 ~ all_f | subclass, dm_decade1_all_f, weights = ~weights)
male_mm_decade2_all_f <- feols(pm100 ~ all_f | subclass, dm_decade2_all_f, weights = ~weights)
male_mm_decade3_all_f <- feols(pm100 ~ all_f | subclass, dm_decade3_all_f, weights = ~weights)
male_mm_drugs_all_f <- feols(pm100 ~ all_f | subclass, dm_drugs_all_f, weights = ~weights)
male_mm_surgery_all_f <- feols(pm100 ~ all_f | subclass, dm_surgery_all_f, weights = ~weights)
male_mm_uni_all_f <- feols(pm100 ~ all_f | subclass, dm_uni_all_f, weights = ~weights)
male_mm_corp_all_f <- feols(pm100 ~ all_f | subclass, dm_corp_all_f, weights = ~weights)

male_mm_all_maj_f <- feols(pm100 ~ maj_f | subclass, dm_all_maj_f, weights = ~weights)
male_mm_decade1_maj_f <- feols(pm100 ~ maj_f | subclass, dm_decade1_maj_f, weights = ~weights)
male_mm_decade2_maj_f <- feols(pm100 ~ maj_f | subclass, dm_decade2_maj_f, weights = ~weights)
male_mm_decade3_maj_f <- feols(pm100 ~ maj_f | subclass, dm_decade3_maj_f, weights = ~weights)
male_mm_drugs_maj_f <- feols(pm100 ~ maj_f | subclass, dm_drugs_maj_f, weights = ~weights)
male_mm_surgery_maj_f <- feols(pm100 ~ maj_f | subclass, dm_surgery_maj_f, weights = ~weights)
male_mm_uni_maj_f <- feols(pm100 ~ maj_f | subclass, dm_uni_maj_f, weights = ~weights)
male_mm_corp_maj_f <- feols(pm100 ~ maj_f | subclass, dm_corp_maj_f, weights = ~weights)

male_mm_all_min_f <- feols(pm100 ~ min_f | subclass, dm_all_min_f, weights = ~weights)
male_mm_decade1_min_f <- feols(pm100 ~ min_f | subclass, dm_decade1_min_f, weights = ~weights)
male_mm_decade2_min_f <- feols(pm100 ~ min_f | subclass, dm_decade2_min_f, weights = ~weights)
male_mm_decade3_min_f <- feols(pm100 ~ min_f | subclass, dm_decade3_min_f, weights = ~weights)
male_mm_drugs_min_f <- feols(pm100 ~ min_f | subclass, dm_drugs_min_f, weights = ~weights)
male_mm_surgery_min_f <- feols(pm100 ~ min_f | subclass, dm_surgery_min_f, weights = ~weights)
male_mm_uni_min_f <- feols(pm100 ~ min_f | subclass, dm_uni_min_f, weights = ~weights)
male_mm_corp_min_f <- feols(pm100 ~ min_f | subclass, dm_corp_min_f, weights = ~weights)

####################  S-14   ###################

etable(
  mm_all_min_f,
  mm_decade1_min_f,
  mm_decade2_min_f,
  mm_decade3_min_f, 
  mm_drugs_min_f,
  mm_surgery_min_f,
  mm_uni_min_f,
  mm_corp_min_f,
  cluster="patent_id")

####################  S-15   ###################

etable(
  mm_all_maj_f,
  mm_decade1_maj_f,
  mm_decade2_maj_f,
  mm_decade3_maj_f, 
  mm_drugs_maj_f,
  mm_surgery_maj_f,
  mm_uni_maj_f,
  mm_corp_maj_f,
  cluster="patent_id")

####################  S-16   ###################

etable(
  mm_all_all_f, 
  mm_decade1_all_f, 
  mm_decade2_all_f, 
  mm_decade3_all_f, 
  mm_drugs_all_f,
  mm_surgery_all_f,
  mm_uni_all_f,
  mm_corp_all_f,
  cluster="patent_id")


####################  S-18   ###################

etable(
  male_mm_all_min_f,
  male_mm_decade1_min_f,
  male_mm_decade2_min_f,
  male_mm_decade3_min_f, 
  male_mm_drugs_min_f,
  male_mm_surgery_min_f,
  male_mm_uni_min_f,
  male_mm_corp_min_f,
  cluster="patent_id")

####################  S-19   ###################

etable(
  male_mm_all_maj_f,
  male_mm_decade1_maj_f,
  male_mm_decade2_maj_f,
  male_mm_decade3_maj_f, 
  male_mm_drugs_maj_f,
  male_mm_surgery_maj_f,
  male_mm_uni_maj_f,
  male_mm_corp_maj_f,
  cluster="patent_id")

####################  S-20   ###################

etable(
  male_mm_all_all_f, 
  male_mm_decade1_all_f, 
  male_mm_decade2_all_f, 
  male_mm_decade3_all_f, 
  male_mm_drugs_all_f,
  male_mm_surgery_all_f,
  male_mm_uni_all_f,
  male_mm_corp_all_f,
  cluster="patent_id")


### Male replication figure

# Plotting functions used (original source: https://github.com/ArielOrtizBobea/spec_chart/blob/master/spec_chart_function.R)
source("build_tables_figures/spec_chart_no_dot.R")
source("build_tables_figures/spec_chart_only_dots.R")

## the function to generate one graph
chart <- function(basic, matchmin, matchmaj, matchall, titletxt = NA, label = T) {
  #basic coef
  coef_basic <- NULL
  for (ests in 1:3) {
    coef_basic <- rbind(coef_basic, get(basic)$coefficients[[ests]])
  }
  
  #match coef
  coef_match <- NULL
  for (reg in c(matchmin, matchmaj, matchall)) {
    coef_match <- rbind(coef_match, get(reg)$coefficients[[1]])
  }
  
  #combined coef
  coef <- rbind(matrix(coef_basic[1]), matrix(coef_match[1]), 
                matrix(coef_basic[2]), matrix(coef_match[2]), 
                matrix(coef_basic[3]), matrix(coef_match[3]))
  
  #basic CI95
  ci_95_basic <- NULL
  for (ests in 1:3) {
    ci_95_basic <- rbind(ci_95_basic, unname(unlist(confint(get(basic))[ests,])))
  }
  
  #match CI95
  ci_95_match <- NULL
  for (reg in c(matchmin, matchmaj, matchall)) {
    ci_95_match <- rbind(ci_95_match, unname(unlist(confint(get(reg))[1,])))
  }
  
  #combined CI95
  ci_95 <- rbind(ci_95_basic[1,], ci_95_match[1,], 
                 ci_95_basic[2,], ci_95_match[2,],
                 ci_95_basic[3,], ci_95_match[3,])
  #basic CI99
  ci_99_basic <- NULL
  for (ests in 1:3) {
    ci_99_basic <- rbind(ci_99_basic, unname(unlist(confint(get(basic), level = 0.99)[ests,])))
  }
  
  #match CI99
  ci_99_match <- NULL
  for (reg in c(matchmin, matchmaj, matchall)) {
    ci_99_match <- rbind(ci_99_match, unname(unlist(confint(get(reg), level = 0.99)[1,])))
  }
  
  #combined CI99
  ci_99 <- rbind(ci_99_basic[1,], ci_99_match[1,], 
                 ci_99_basic[2,], ci_99_match[2,],
                 ci_99_basic[3,], ci_99_match[3,])
  
  #Estimate indicators
  fmin <- matrix(rep(c(T, F), c(2, 4)))
  fmaj <- matrix(rep(c(F, T, F), c(2, 2, 2)))
  fall <- matrix(rep(c(F, T), c(4, 2)))
  vars <- cbind(fmin, fmaj, fall)
  
  #Prepare the final data
  patent_reg <- data.frame(cbind(coef, ci_95, ci_99, vars))
  for (i in 6:8) {
    patent_reg[i] <- as.logical(patent_reg[, i])
  }
  
  colnames(patent_reg) <- c("coef", "ci95_l", "co95_h", "ci99_l", "ci99_h", "fmin",
                            "fmaj", "fall")
  
  patent_reg <- dplyr::select(patent_reg, -c(fmin, fmaj, fall))
  
  #Set labels
  labels <- list("Estimates" = c("Minority Female", "Majority Female", "All Female"))
  
  #plot
  if (label == T) {
    schart_no_dot(patent_reg, n = 2, index.est = 1, index.se = NA, index.ci = 2:5, 
                  ylim = c(-3, 10), adj=c(1,1), offset=c(1.5, 1), ylab="Estimated Effect", 
                  pch.dot=c(21,21,21,21), col.dot=c("black","grey","white","black"),
                  bg.dot=c("black","white","white","black"), lwd.symbol=2, heights = c(1, 1), 
                  highlight=c(2, 4, 6), col.est=c("black","royalblue"), titleTXT = titletxt, 
                  col.est2=c("grey60","lightskyblue"), cex = c(1, 0.8), leftmargin = 12)
    ticks <- seq(-3, 10, 1)
    axis(side=2, at=ticks, las = 2, tck=-0.025)
  } else {
    schart_no_dot(patent_reg, n = 2, index.est = 1, index.se = NA, index.ci = 2:5, 
                  ylim = c(-3, 10), adj=c(1,1), offset=c(1.5, 1), ylab=NA, 
                  pch.dot=c(21,21,21,21), col.dot=c("black","grey","white","black"),
                  bg.dot=c("black","white","white","black"), lwd.symbol=2, heights = c(1, 1), 
                  highlight=c(2, 4, 6), col.est=c("black","royalblue"), titleTXT = titletxt, 
                  col.est2=c("grey60","lightskyblue"), cex = c(1, 0.8), leftmargin = 12)
    ticks <- seq(-3, 10, 1)
    axis(side=2, at=ticks, las = 2, tck=-0.025)
  }
}

## the function to generate one dot graph
chart_dot <- function(basic, matchmin, matchmaj, matchall, label = T) {
  #basic coef
  coef_basic <- NULL
  for (ests in 1:3) {
    coef_basic <- rbind(coef_basic, get(basic)$coefficients[[ests]])
  }
  
  #match coef
  coef_match <- NULL
  for (reg in c(matchmin, matchmaj, matchall)) {
    coef_match <- rbind(coef_match, get(reg)$coefficients[[1]])
  }
  
  #combined coef
  coef <- rbind(matrix(coef_basic[1]), matrix(coef_match[1]), 
                matrix(coef_basic[2]), matrix(coef_match[2]), 
                matrix(coef_basic[3]), matrix(coef_match[3]))
  
  #basic CI95
  ci_95_basic <- NULL
  for (ests in 1:3) {
    ci_95_basic <- rbind(ci_95_basic, unname(unlist(confint(get(basic))[ests,])))
  }
  
  #match CI95
  ci_95_match <- NULL
  for (reg in c(matchmin, matchmaj, matchall)) {
    ci_95_match <- rbind(ci_95_match, unname(unlist(confint(get(reg))[1,])))
  }
  
  #combined CI95
  ci_95 <- rbind(ci_95_basic[1,], ci_95_match[1,], 
                 ci_95_basic[2,], ci_95_match[2,],
                 ci_95_basic[3,], ci_95_match[3,])
  #basic CI99
  ci_99_basic <- NULL
  for (ests in 1:3) {
    ci_99_basic <- rbind(ci_99_basic, unname(unlist(confint(get(basic), level = 0.99)[ests,])))
  }
  
  #match CI99
  ci_99_match <- NULL
  for (reg in c(matchmin, matchmaj, matchall)) {
    ci_99_match <- rbind(ci_99_match, unname(unlist(confint(get(reg), level = 0.99)[1,])))
  }
  
  #combined CI99
  ci_99 <- rbind(ci_99_basic[1,], ci_99_match[1,], 
                 ci_99_basic[2,], ci_99_match[2,],
                 ci_99_basic[3,], ci_99_match[3,])
  
  #Estimate indicators
  fmin <- matrix(rep(c(T, F), c(2, 4)))
  fmaj <- matrix(rep(c(F, T, F), c(2, 2, 2)))
  fall <- matrix(rep(c(F, T), c(4, 2)))
  vars <- cbind(fmin, fmaj, fall)
  
  #Prepare the final data
  patent_reg <- data.frame(cbind(coef, ci_95, ci_99, vars))
  for (i in 6:8) {
    patent_reg[i] <- as.logical(patent_reg[, i])
  }
  
  colnames(patent_reg) <- c("coef", "ci95_l", "co95_h", "ci99_l", "ci99_h", "fmin",
                            "fmaj", "fall")
  
  #Set labels
  labels <- list("Estimates" = c("Minority Female", "Majority Female", "All Female"))
  labelsf <- list(" " = c(" ", " ", " "))
  
  #plot
  if (label == T) {
    schart_only_dots(patent_reg, labels, n = 2, index.est = 1, index.se = NA, index.ci = 2:5, 
                     ylim = c(-3, 11), adj=c(1,1), offset=c(0.3, 0), ylab="Estimated Effect", 
                     pch.dot=c(21,21,21,21), col.dot=c("black","grey","white","black"),
                     bg.dot=c("black","white","white","black"), lwd.symbol=2, heights = c(1, 1), 
                     highlight=c(2, 4, 6), col.est=c("black","royalblue"), 
                     col.est2=c("grey60","lightskyblue"), cex = c(1, 1), leftmargin = 12)
  } else {
    schart_only_dots(patent_reg, labelsf, n = 2, index.est = 1, index.se = NA, index.ci = 2:5, 
                     ylim = c(-3, 11), adj=c(1,1), offset=c(1.5, 1), ylab=NA, 
                     pch.dot=c(21,21,21,21), col.dot=c("black","grey","white","black"),
                     bg.dot=c("black","white","white","black"), lwd.symbol=2, heights = c(1, 1), 
                     highlight=c(2, 4, 6), col.est=c("black","royalblue"), 
                     col.est2=c("grey60","lightskyblue"), cex = c(1, 1), leftmargin = 12)
  }
}

####################  Figure S-5  ###################
par(mfrow = c(3,4), oma=c(1,7,1,1), xpd=F)
par(mar=c(0,2,3,1))
chart("m_all", "mm_all_min_f", "mm_all_maj_f", "mm_all_all_f", 
      titletxt = "(A) All")
chart("m_decade1", "mm_decade1_min_f", "mm_decade1_maj_f", "mm_decade1_all_f", 
      titletxt = "(B) 1970s and 1980s", label = F)
chart("m_decade2", "mm_decade2_min_f", "mm_decade2_maj_f", "mm_decade2_all_f", 
      titletxt = "(C) 1990s", label = F)
chart("m_decade3", "mm_decade3_min_f", "mm_decade3_maj_f", "mm_decade3_all_f", 
      titletxt = "(D) 2000s", label = F)

chart("m_corp", "mm_corp_min_f", "mm_corp_maj_f", "mm_corp_all_f", 
      titletxt = "(E) Corporate")
chart("m_uni", "mm_uni_min_f", "mm_uni_maj_f", "mm_uni_all_f", 
      titletxt = "(F) Non-Corporate", label = F)
chart("m_drugs", "mm_drugs_min_f", "mm_drugs_maj_f", "mm_drugs_all_f", 
      titletxt = "(G) Drugs", label = F)
chart("m_surgery", "mm_surgery_min_f", "mm_surgery_maj_f", "mm_surgery_all_f", 
      titletxt = "(H) Surgery", label = F)

par(mar=c(8,2,0,1))
chart_dot("m_all", "mm_all_min_f", "mm_all_maj_f", "mm_all_all_f", label = F)
chart_dot("m_all", "mm_all_min_f", "mm_all_maj_f", "mm_all_all_f", label = F)
chart_dot("m_all", "mm_all_min_f", "mm_all_maj_f", "mm_all_all_f", label = F)
chart_dot("m_all", "mm_all_min_f", "mm_all_maj_f", "mm_all_all_f", label = F)

####################  Figure S-4   ###################

# combine the graphs
par(mfrow = c(3,4), oma=c(1,7,1,1), xpd=F)
par(mar=c(0,2,3,1))
chart("male_m_all", "male_mm_all_min_f", "male_mm_all_maj_f", "male_mm_all_all_f", 
      titletxt = "(A) All")
chart("male_m_decade1", "male_mm_decade1_min_f", "male_mm_decade1_maj_f", "male_mm_decade1_all_f", 
      titletxt = "(B) 1970s and 1980s", label = F)
chart("male_m_decade2", "male_mm_decade2_min_f", "male_mm_decade2_maj_f", "male_mm_decade2_all_f", 
      titletxt = "(C) 1990s", label = F)
chart("male_m_decade3", "male_mm_decade3_min_f", "male_mm_decade3_maj_f", "male_mm_decade3_all_f", 
      titletxt = "(D) 2000s", label = F)

chart("male_m_corp", "male_mm_corp_min_f", "male_mm_corp_maj_f", "male_mm_corp_all_f", 
      titletxt = "(E) Corporate")
chart("male_m_uni", "male_mm_uni_min_f", "male_mm_uni_maj_f", "male_mm_uni_all_f", 
      titletxt = "(F) Non-Corporate", label = F)
chart("male_m_drugs", "male_mm_drugs_min_f", "male_mm_drugs_maj_f", "male_mm_drugs_all_f", 
      titletxt = "(G) Drugs", label = F)
chart("male_m_surgery", "male_mm_surgery_min_f", "male_mm_surgery_maj_f", "male_mm_surgery_all_f", 
      titletxt = "(H) Surgery", label = F)

par(mar=c(8,2,0,1))
chart_dot("male_m_all", "male_mm_all_min_f", "male_mm_all_maj_f", "male_mm_all_all_f", label = F)
chart_dot("male_m_all", "male_mm_all_min_f", "male_mm_all_maj_f", "male_mm_all_all_f", label = F)
chart_dot("male_m_all", "male_mm_all_min_f", "male_mm_all_maj_f", "male_mm_all_all_f", label = F)
chart_dot("male_m_all", "male_mm_all_min_f", "male_mm_all_maj_f", "male_mm_all_all_f", label = F)


## Only yaer-disease-subcategory
match_teams3 <- function(x, data) {
  matchme <- 
    data %>% 
    mutate(treat = !!sym(x)) %>% 
    filter(all_male | !!sym(x), !is.na(m_1)) %>% 
    dplyr::select(patent_id, pf100, pm100, treat, !!sym(x), yr, sc, sz, m_1)
  
  mout <- matchit(treat ~ yr + sc + m_1, 
                  data=matchme, 
                  method="exact")
  
  mdata <- match.data(mout) %>% as_tibble()
  mdata
}

dm3_all_all_f <- match_teams3('all_f', pat_mesh_reg_data)
dm3_all_maj_f <- match_teams3('maj_f', pat_mesh_reg_data)
dm3_all_min_f <- match_teams3('min_f', pat_mesh_reg_data)

mm3_all_all_f <- feols(pf100 ~ all_f | subclass + yr^sz, dm3_all_all_f, weights = ~weights)
mm3_all_maj_f <- feols(pf100 ~ maj_f | subclass + yr^sz, dm3_all_maj_f, weights = ~weights)
mm3_all_min_f <- feols(pf100 ~ min_f | subclass + yr^sz, dm3_all_min_f, weights = ~weights)

## Only disease-subcategory
match_teams2 <- function(x, data) {
  matchme <- 
    data %>% 
    mutate(treat = !!sym(x)) %>% 
    filter(all_male | !!sym(x), !is.na(m_1)) %>% 
    dplyr::select(patent_id, pf100, pm100, treat, !!sym(x), yr, sc, sz, m_1)
  
  mout <- matchit(treat ~ sc + m_1, 
                  data=matchme, 
                  method="exact")
  
  mdata <- match.data(mout) %>% as_tibble()
  mdata
}

dm2_all_all_f <- match_teams2('all_f', pat_mesh_reg_data)
dm2_all_maj_f <- match_teams2('maj_f', pat_mesh_reg_data)
dm2_all_min_f <- match_teams2('min_f', pat_mesh_reg_data)

mm2_all_all_f <- feols(pf100 ~ all_f | subclass + yr^sz, dm2_all_all_f, weights = ~weights)
mm2_all_maj_f <- feols(pf100 ~ maj_f | subclass + yr^sz, dm2_all_maj_f, weights = ~weights)
mm2_all_min_f <- feols(pf100 ~ min_f | subclass + yr^sz, dm2_all_min_f, weights = ~weights)

####################  S-29   ###################

etable(mm2_all_min_f, mm3_all_min_f, mm2_all_maj_f, 
       mm3_all_maj_f, mm2_all_all_f, mm3_all_all_f, 
       cluster="patent_id")

################################################################################
###                    Phased in FE models
################################################################################

mf1 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1, 
             pat_mesh_reg_data)

mf2 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1 + yr^m_2, 
             pat_mesh_reg_data)

mf3 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1 + yr^m_2 + yr^m_3, 
             pat_mesh_reg_data)

mf4 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1 + yr^m_2 + yr^m_3 + yr^m_4, 
             pat_mesh_reg_data)

mf5 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1 + yr^m_2 + yr^m_3 + yr^m_4 + yr^m_5, 
             pat_mesh_reg_data)

mf6 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1 + yr^m_2 + yr^m_3 + yr^m_4 + yr^m_5 +
               yr^m_6, 
             pat_mesh_reg_data)

mf7 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1 + yr^m_2 + yr^m_3 + yr^m_4 + yr^m_5 +
               yr^m_6 + yr^m_7, 
             pat_mesh_reg_data)

mf8 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1 + yr^m_2 + yr^m_3 + yr^m_4 + yr^m_5 +
               yr^m_6 + yr^m_7 + yr^m_8, 
             pat_mesh_reg_data)

mf9 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1 + yr^m_2 + yr^m_3 + yr^m_4 + yr^m_5 +
               yr^m_6 + yr^m_7 + yr^m_8 + yr^m_9, 
             pat_mesh_reg_data)

mf10 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1 + yr^m_2 + yr^m_3 + yr^m_4 + yr^m_5 +
                yr^m_6 + yr^m_7 + yr^m_8 + yr^m_9 + yr^m_10, 
              pat_mesh_reg_data)


####################  S-30   ###################

etable(mf1, mf2, mf3, mf4, mf5, mf6, mf7, mf8, mf9, mf10,
       cluster="patent_id")

################################################################################
###             Alternative DVs and alternative modeling strategies.
################################################################################

## Re-assemble pat_reg_data

pat_reg_data <- 
  patent_gbd %>% 
  left_join(inventor_gender_counts) %>%
  group_by(patent_id) %>% 
  filter(row_number() == 1) %>% 
  ungroup() %>% 
  mutate(
    min_f = (female_count > 0) & (male_count > female_count),
    maj_f = (female_count >= male_count) & (male_count > 0),
    all_f = (female_count > 0) & (male_count == 0),
    all_male = female_count == 0 & male_count > 0,
    pf100 = 100*patent_female,
    yr = patent_year,
    sc = subcategory_id,
    sz = pat_team_sz
  ) %>% 
  filter(
    !is.na(female_count), !is.na(male_count)
  ) %>% 
  mutate(
    asg_company = ifelse(is.na(asg_company), 0, asg_company)
  )


## have the gbd data merged with gender counts data, and add additional variables

pat_reg_data_gbd <- 
  patent_gbd %>% 
  left_join(inventor_gender_counts) %>%
  mutate(
    min_f = (female_count > 0) & (male_count > female_count),
    maj_f = (female_count >= male_count) & (male_count > 0),
    all_f = (female_count > 0) & (male_count == 0),
    all_male = female_count == 0 & male_count > 0,
    pf100 = 100*patent_female,
    yr = patent_year,
    sc = subcategory_id,
    sz = pat_team_sz,
    log_if = log(incidence_female+1),
    log_im = log(incidence_male+1),
    weights = 1 / num_disease_mesh_at_patent
  ) %>% 
  filter(
    !is.na(female_count), !is.na(male_count)
  ) %>% 
  mutate(
    asg_company = ifelse(is.na(asg_company), 0, asg_company)
  )

## Add NIH grant and patent paper pair info

pat_reg_data_input <- 
  pat_reg_data %>% 
  left_join(
    patent_inputs %>% 
      group_by(patent_id) %>% 
      slice_head(n=1) %>% 
      ungroup() %>% 
      dplyr::select(
        patent_id, 
        patent_has_a_marx_article,
        num_raw_marx_articles,
        num_f_raw_marx_article,
        pct_f_raw_marx_article,
        ln_num_f_raw_marx_article,
        num_matched_marx_articles,
        num_f_matched_marx_article,
        ln_num_f_matched_marx_article,
        pct_f_matched_marx_article,
        num_f_ex_article,
        ln_num_f_ex_article,
        pct_f_ex_article,
        num_ex_article,
        num_project
      )
  ) %>% 
  mutate(
    pct_f_raw_marx_article = pct_f_raw_marx_article >= 0.5,
    f_matched_marx_article = pct_f_matched_marx_article >= 0.5,
    f_ex_article = pct_f_ex_article >= 0.5
  )

## Add the mesh FE indicator
patent_mesh_reg_data_input <- 
  pat_reg_data_input %>%
  left_join(pat_disease_fe) %>% 
  filter(m_1 != 0, !is.na(m_1))

### Incidence regressions on patent check tags

m_validate_incidence_female1 <- 
  feols(log_if ~ patent_female + log_im,  
        data = pat_reg_data_gbd, weights = ~weights)

m_validate_incidence_female2 <- 
  feols(log_if ~ patent_female + patent_male + log_im, 
        data = pat_reg_data_gbd, weights = ~weights)

m_validate_incidence_male1 <- 
  feols(log_im ~ patent_male + log_if, 
        data = pat_reg_data_gbd, weights = ~weights)

m_validate_incidence_male2 <- 
  feols(log_im ~ patent_female + patent_male + log_if, 
        data = pat_reg_data_gbd, weights = ~weights)


####################  S-8   ###################

etable(m_validate_incidence_female1, m_validate_incidence_female2, 
       m_validate_incidence_male1, m_validate_incidence_male2, 
       cluster = "patent_id")

## Basline results
m_base_ols <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                    pat_reg_data)

#with m1
m_base_ols_m1 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1, 
                       pat_mesh_reg_data)

## Logit results
m_base_log <- feglm(patent_female ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                    pat_reg_data, family=binomial(link = "logit"))

#with m1
m_base_log_m1 <- feglm(patent_female ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1, 
                       pat_mesh_reg_data, family=binomial(link = "logit"))

## Female Only
m_only_ols <- feols(pf_only ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                    pat_reg_data %>% mutate(pf_only = pf100*(patent_male == 0)))

#with m1
m_only_ols_m1 <- feols(pf_only ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1, 
                       pat_mesh_reg_data %>% mutate(pf_only = pf100*(patent_male == 0)))

## Logit Female only
m_only_log <- feglm(pf_only ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                    pat_reg_data %>% mutate(pf_only = patent_female*(patent_male == 0)), 
                    family=binomial(link = "logit"))

#with m_1
m_only_log_m1 <- feglm(pf_only ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1, 
                       pat_mesh_reg_data %>% mutate(pf_only = patent_female*(patent_male == 0)), 
                       family=binomial(link = "logit"))

## Patent paper-pair: Input research mainly focused on women?
m_pair_ols <- feols(f_matched_marx_article100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + num_matched_marx_articles, 
                    pat_reg_data_input %>% 
                      mutate(f_matched_marx_article100 = f_matched_marx_article * 100))

m_pair_log <- feglm(f_matched_marx_article ~ min_f + maj_f + all_f | yr^sc + yr^sz + num_matched_marx_articles, 
                    pat_reg_data_input,
                    family=binomial(link = "logit"))

## Patent NIH: Inpurt resereach mainly foucsed on women?
m_nih_ols <- feols(f_ex_article100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + num_ex_article, 
                   pat_reg_data_input %>%
                     mutate(f_ex_article100 = f_ex_article * 100))

m_nih_log <- feglm(f_ex_article ~ min_f + maj_f + all_f | yr^sc + yr^sz + num_ex_article, 
                   pat_reg_data_input,
                   family=binomial(link = "logit"))

## GBD OLS
m_gbd_ols <- feols(log_if ~ min_f + maj_f + all_f + log_im | yr^sc + yr^sz + num_disease_mesh_at_patent, 
                   data = pat_reg_data_gbd, weights = ~weights)

## GBD Count Model
m_gbd_pos <- feglm(incidence_female ~ min_f + maj_f + all_f + log_im | yr^sc + yr^sz + num_disease_mesh_at_patent,
                   pat_reg_data_gbd, weights = ~weights,
                   family=quasipoisson(link = "log"))

# Table of modelling strategy and shift in MTI DV

####################  S-21   ###################

etable(m_base_ols, m_base_ols_m1, m_base_log, m_base_log_m1, 
       m_only_ols, m_only_ols_m1, m_only_log, m_only_log_m1, 
       cluster = "patent_id")

####################  S-45   ###################

etable(m_pair_ols, m_pair_log, 
       m_nih_ols, m_nih_log, cluster = "patent_id")

################################################################################
###                    Different patent text results
################################################################################

## Re-assemble pat_reg_data

pat_reg_data <- 
  patent_gbd %>% 
  left_join(inventor_gender_counts) %>%
  group_by(patent_id) %>% 
  filter(row_number() == 1) %>% 
  ungroup() %>% 
  mutate(
    min_f = (female_count > 0) & (male_count > female_count),
    maj_f = (female_count >= male_count) & (male_count > 0),
    all_f = (female_count > 0) & (male_count == 0),
    all_male = female_count == 0 & male_count > 0,
    pf100 = 100*patent_female,
    yr = patent_year,
    sc = subcategory_id,
    sz = pat_team_sz
  ) %>% 
  filter(
    !is.na(female_count), !is.na(male_count)
  ) %>% 
  mutate(
    asg_company = ifelse(is.na(asg_company), 0, asg_company)
  )


## Get the disease area fixed effect

pat_disease_fe <- 
  patent_mesh_and_tree %>%  
  filter(mesh_tree_num != "", str_sub(mesh_tree_num,1,1) == "C", str_count(mesh_tree_num, "\\.") >  2) %>% 
  group_by(patent_id, mesh_term) %>% 
  slice_head(n=1) %>% 
  ungroup() %>% 
  mutate(
    mesh_tree = as.numeric(as.factor(mesh_term)),
    mesh_rank = patent_mesh_rank
  ) %>% 
  dplyr::select(patent_id, mesh_tree, mesh_rank) %>% 
  arrange(patent_id, mesh_rank) %>% 
  group_by(patent_id) %>% 
  slice_head(n=10) %>% 
  mutate(m = row_number(mesh_rank)) %>% 
  ungroup() %>% 
  arrange(patent_id, mesh_rank) %>% 
  dplyr::select(patent_id, mesh_tree, m) %>% 
  spread(m, mesh_tree, sep="_", fill=0) 


## add that into the main dataset

pat_mesh_reg_data <- 
  pat_reg_data %>% 
  left_join(pat_disease_fe) %>% 
  filter(m_1 != 0, !is.na(m_1))

## data with 10k article content characters fed MTI algorithm

pat_reg_data_10k <- 
  patent_gbd_10k %>% 
  left_join(inventor_gender_counts) %>%
  group_by(patent_id) %>% 
  filter(row_number() == 1) %>% 
  ungroup() %>% 
  mutate(
    min_f = (female_count > 0) & (male_count > female_count),
    maj_f = (female_count >= male_count) & (male_count > 0),
    all_f = (female_count > 0) & (male_count == 0),
    all_male = female_count == 0 & male_count > 0,
    pf100 = 100*patent_female,
    yr = patent_year,
    sc = subcategory_id,
    sz = pat_team_sz
  ) %>% 
  filter(
    !is.na(female_count), !is.na(male_count)
  ) %>% 
  mutate(
    asg_company = ifelse(is.na(asg_company), 0, asg_company)
  )

## having the disease area fixed effect

pat_disease_fe_10k <- 
  patent_mesh_and_tree_10k %>%  
  filter(mesh_tree_num != "", str_sub(mesh_tree_num,1,1) == "C", str_count(mesh_tree_num, "\\.") >  2) %>% 
  group_by(patent_id, mesh_term) %>% 
  slice_head(n=1) %>% 
  ungroup() %>% 
  mutate(
    mesh_tree = as.numeric(as.factor(mesh_term)),
    mesh_rank = patent_mesh_rank
  ) %>% 
  dplyr::select(patent_id, mesh_tree, mesh_rank) %>% 
  arrange(patent_id, mesh_rank) %>% 
  group_by(patent_id) %>% 
  slice_head(n=10) %>% 
  mutate(m = row_number(mesh_rank)) %>% 
  ungroup() %>% 
  arrange(patent_id, mesh_rank) %>% 
  dplyr::select(patent_id, mesh_tree, m) %>% 
  spread(m, mesh_tree, sep="_", fill=0) 

pat_mesh_reg_data_10k <- 
  pat_reg_data_10k %>% 
  left_join(pat_disease_fe_10k) %>% 
  filter(m_1 != 0, !is.na(m_1))

## data with only article's abstract and title fed MTI algorithm

pat_reg_data_ta <- 
  patent_gbd_ta %>% 
  left_join(inventor_gender_counts) %>%
  group_by(patent_id) %>% 
  filter(row_number() == 1) %>% 
  ungroup() %>% 
  mutate(
    min_f = (female_count > 0) & (male_count > female_count),
    maj_f = (female_count >= male_count) & (male_count > 0),
    all_f = (female_count > 0) & (male_count == 0),
    all_male = female_count == 0 & male_count > 0,
    pf100 = 100*patent_female,
    yr = patent_year,
    sc = subcategory_id,
    sz = pat_team_sz
  ) %>% 
  filter(
    !is.na(female_count), !is.na(male_count)
  ) %>% 
  mutate(
    asg_company = ifelse(is.na(asg_company), 0, asg_company)
  )

## add disease area fixed effect

pat_disease_fe_ta <- 
  patent_mesh_and_tree_ta %>%  
  filter(mesh_tree_num != "", str_sub(mesh_tree_num,1,1) == "C", str_count(mesh_tree_num, "\\.") >  2) %>% 
  group_by(patent_id, mesh_term) %>% 
  slice_head(n=1) %>% 
  ungroup() %>% 
  mutate(
    mesh_tree = as.numeric(as.factor(mesh_term)),
    mesh_rank = patent_mesh_rank
  ) %>% 
  dplyr::select(patent_id, mesh_tree, mesh_rank) %>% 
  arrange(patent_id, mesh_rank) %>% 
  group_by(patent_id) %>% 
  slice_head(n=10) %>% 
  mutate(m = row_number(mesh_rank)) %>% 
  ungroup() %>% 
  arrange(patent_id, mesh_rank) %>% 
  dplyr::select(patent_id, mesh_tree, m) %>% 
  spread(m, mesh_tree, sep="_", fill=0) 

pat_mesh_reg_data_ta <- 
  pat_reg_data_ta %>% 
  left_join(pat_disease_fe_ta) %>% 
  filter(m_1 != 0, !is.na(m_1))

## data with strict filtering fed MTI algorithm

pat_reg_data_strict <- 
  patent_gbd_strict %>% 
  left_join(inventor_gender_counts) %>%
  group_by(patent_id) %>% 
  filter(row_number() == 1) %>% 
  ungroup() %>% 
  mutate(
    min_f = (female_count > 0) & (male_count > female_count),
    maj_f = (female_count >= male_count) & (male_count > 0),
    all_f = (female_count > 0) & (male_count == 0),
    all_male = female_count == 0 & male_count > 0,
    pf100 = 100*patent_female,
    yr = patent_year,
    sc = subcategory_id,
    sz = pat_team_sz
  ) %>% 
  filter(
    !is.na(female_count), !is.na(male_count)
  ) %>% 
  mutate(
    asg_company = ifelse(is.na(asg_company), 0, asg_company)
  )

## disease area fixed effect

pat_disease_fe_strict <- 
  patent_mesh_and_tree_strict %>%  
  filter(mesh_tree_num != "", str_sub(mesh_tree_num,1,1) == "C", str_count(mesh_tree_num, "\\.") >  2) %>% 
  group_by(patent_id, mesh_term) %>% 
  slice_head(n=1) %>% 
  ungroup() %>% 
  mutate(
    mesh_tree = as.numeric(as.factor(mesh_term)),
    mesh_rank = patent_mesh_rank
  ) %>% 
  dplyr::select(patent_id, mesh_tree, mesh_rank) %>% 
  arrange(patent_id, mesh_rank) %>% 
  group_by(patent_id) %>% 
  slice_head(n=10) %>% 
  mutate(m = row_number(mesh_rank)) %>% 
  ungroup() %>% 
  arrange(patent_id, mesh_rank) %>% 
  dplyr::select(patent_id, mesh_tree, m) %>% 
  spread(m, mesh_tree, sep="_", fill=0) 

pat_mesh_reg_data_strict <- 
  pat_reg_data_strict %>% 
  left_join(pat_disease_fe_strict) %>% 
  filter(m_1 != 0, !is.na(m_1))

## the models

m_base <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                pat_reg_data)

mf_base <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1, 
                 pat_mesh_reg_data)

m_base_ta <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                   pat_reg_data_ta)

mf_base_ta <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1, 
                    pat_mesh_reg_data_ta)

m_base_strict <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                       pat_reg_data_strict)

mf_base_strict <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1, 
                        pat_mesh_reg_data_strict)

m_base_10k <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                    pat_reg_data_10k)

mf_base_10k <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1, 
                     pat_mesh_reg_data_10k)

####################  S-22   ###################

etable(m_base, mf_base,
       m_base_ta, mf_base_ta,
       m_base_strict, mf_base_strict,
       m_base_10k, mf_base_10k,
       cluster="patent_id")

### Humans and Animals patent tags

## construct the datasets
pats_human <- patent_mesh_raw %>% group_by(patent_id) %>% summarize(human_mesh = sum(mesh_term == "Humans" ))
pats_animal <- patent_mesh_raw %>% group_by(patent_id) %>% summarize(animal_mesh = sum(mesh_term == "Animals" ))

pat_reg_data_ha <- 
  pat_reg_data %>% 
  left_join(pats_human) %>% 
  left_join(pats_animal)

pat_mesh_reg_data_ha <- 
  pat_mesh_reg_data %>% 
  left_join(pats_human) %>% 
  left_join(pats_animal)

## models

m_human1 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                  pat_reg_data_ha %>% filter(human_mesh == 1))

m_human2 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1, 
                  pat_mesh_reg_data_ha %>% filter(human_mesh == 1))

m_animal1 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                   pat_reg_data_ha %>% filter(animal_mesh != 1))

m_animal2 <- feols(pf100 ~ min_f + maj_f + all_f | yr^sc + yr^sz + yr^m_1, 
                   pat_mesh_reg_data_ha %>% filter(animal_mesh != 1))


####################  S-23   ###################

etable(m_human1, m_human2, m_animal1, m_animal2, 
       cluster = "patent_id")

################################################################################
###               Patent Text Gender Prediction and Analysis
################################################################################
### Text examples presented in figures S5, S6, and S7

### Patent text data, first 2.5k and 10k characters
tas_2500 <- fread("clean_data/patent_data/patent_tas_2500.tsv", sep = "\t") %>% as_tibble() %>%
  
  # rename for distinguishing after merging
  rename(text_2500 = brf_sum_text)
  
  # only select id and summary text columns
  select(patent_id, brf_sum_text)

## Patent title text from PatentView
title_only <- fread("clean_data/patent_data/patent.tsv", sep = "\t", quote = "") %>% as_tibble() %>%
  
  # change patent id to numeric and get rid of NAs to be consistent with the 2 datasets above
  mutate(id = as.numeric(id)) %>%
  filter(!is.na(id)) %>% select(id, title)

## merge them together
pat_text <- tas_2500 %>%
  left_join(title_only, by = c("patent_id" = "id"))


## Save out a random sample of patents for human review
set.seed(12345)
randoms <- runif(nrow(pat_reg_data))
check_focus_sample <- 
  pat_reg_data %>% 
  mutate(
    team_female = (maj_f | all_f) + 0
  ) %>% 
  select(
    patent_id, patent_female, patent_male, team_female
  ) %>% 
  mutate(
    randoms = randoms
  ) %>% 
  arrange(
    patent_female, patent_male, randoms
  ) %>% 
  group_by(
    patent_female, patent_male
  ) %>% 
  mutate(
    rowseq = 1:n()
  ) %>% 
  filter(
    rowseq < 51
  ) %>% 
  left_join(
    pat_text %>% 
      select(patent_id, title, text_2500) %>% 
      mutate(patent_id = as.character(patent_id))
  )
write_csv(check_focus_sample, 
          "../pats_for_human_sex_review1.csv")

## After reviewing 200 we decided to review another 200 to increase the 
## the precision of our accuracy measures.
set.seed(90403)
randoms <- runif(nrow(pat_reg_data))
check_focus_sample <- 
  pat_reg_data %>% 
  mutate(
    team_female = (maj_f | all_f) + 0
  ) %>% 
  select(
    patent_id, patent_female, patent_male, team_female
  ) %>% 
  mutate(
    randoms = randoms
  ) %>% 
  arrange(
    patent_female, patent_male, randoms
  ) %>% 
  group_by(
    patent_female, patent_male
  ) %>% 
  mutate(
    rowseq = 1:n()
  ) %>% 
  filter(
    rowseq < 51
  ) %>% 
  left_join(
    pat_text %>% 
      select(patent_id, title, text_2500) %>% 
      mutate(patent_id = as.character(patent_id))
  )
write_csv(check_focus_sample, 
          "../pats_for_human_sex_review2.csv")


####################  S-7   ###################
## we did further formatting in Excel and tabulations too ##

### Make sure not biased by inventor gender, not stat sig differnces
tgb <- 
  read_csv("../pats_human_sex_review_team_gender.csv") %>% 
  select(patent_female, patent_male, male_only, female_only, both, team_female = team_gender) %>% 
  mutate(
    no = ((male_only+  female_only +  both) == 0) + 0
  )

tgbf <- tgb %>% filter(team_female == 1)
tgbm <- tgb %>% filter(team_female != 1)


tgbf_fp <- tgbf %>% filter(patent_female == 1, patent_male == 0)
tgbm_fp <- tgbm %>% filter(patent_female == 1, patent_male == 0)

tgbf_mp <- tgbf %>% filter(patent_female == 0, patent_male == 1)
tgbm_mp <- tgbm %>% filter(patent_female == 0, patent_male == 1)

tgbf_b <- tgbf %>% filter(patent_female == 1, patent_male == 1)
tgbm_b <- tgbm %>% filter(patent_female == 1, patent_male == 1)


tgbf_no <- tgbf %>% filter(patent_female == 0, patent_male == 0)
tgbm_no <- tgbm %>% filter(patent_female == 0, patent_male == 0)


# Female Invention
# No difference in the accuracy for female majority/minority teams
prop.test(
  x = c(sum(tgbf_fp$female_only), sum(tgbm_fp$female_only)),
  n = c(nrow(tgbf_fp), nrow(tgbm_fp))
)

# Male Invention
## No difference in the accuracy for female majority/minority teams
prop.test(
  x = c(sum(tgbf_mp$male_only), sum(tgbm_mp$male_only)),
  n = c(nrow(tgbf_mp), nrow(tgbm_mp))
)

### Neither
prop.test(
  x = c(sum(tgbf_no$no), sum(tgbm_no$no)),
  n = c(nrow(tgbf_no), nrow(tgbm_no))
)

### Both
prop.test(
  x = c(sum(tgbf_b$both), sum(tgbm_b$both)),
  n = c(nrow(tgbf_b), nrow(tgbm_b))
)

####################  S-46   ###################

## we did further formatting in Excel ##

library(tidylo)
######### WORDS MORE OR LESS LIKELY TO OCCUR IN MALE/FEMALE PATENTS  ###########
## See this to put some magnitudes, they are huge!!!
## https://stats.idre.ucla.edu/stata/faq/how-do-i-interpret-odds-ratios-in-logistic-regression/

tidy_patents <- 
  pat_text_clean %>% 
  mutate(
    patent_id = as.character(patent_id)
  ) %>% 
  left_join(
    pat_reg_data %>% 
      select(patent_id, patent_female, patent_male) 
  ) %>% 
  filter(
    !is.na(patent_female), !is.na(patent_male)
  ) %>% 
  select(patent_id, text, patent_female, patent_male) %>% 
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word,
         !word %in% str_remove_all(stop_words$word, "'"))

raw_logratio_female <- 
  tidy_patents %>% 
  mutate(
    isFemale = ifelse(patent_female == 1,  "Female", "NotFemale")
  ) %>% 
  count(isFemale, word, sort = TRUE) %>% 
  spread(isFemale, n, fill = 0) %>%
  mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
  mutate(logratio = log(Female / NotFemale)) %>%
  arrange(desc(logratio))

wlo_female <- 
  tidy_patents %>% 
  count(patent_female, word, sort = TRUE) %>% 
  bind_log_odds(patent_female, word, n, unweighted=TRUE) %>% 
  filter(patent_female == 1) %>% 
  left_join(
    raw_logratio_female %>% select(word, logratio)
  )


raw_logratio_male <- 
  tidy_patents %>% 
  mutate(
    isMale = ifelse(patent_male == 1,  "Male", "NotMale")
  ) %>% 
  count(isMale, word, sort = TRUE) %>% 
  spread(isMale, n, fill = 0) %>%
  mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
  mutate(logratio = log(Male / NotMale)) %>%
  arrange(desc(logratio))

wlo_male <- 
  tidy_patents %>% 
  count(patent_male, word, sort = TRUE) %>% 
  bind_log_odds(patent_male, word, n, unweighted=TRUE) %>% 
  filter(patent_male == 1) %>% 
  left_join(
    raw_logratio_male %>% select(word, logratio)
  )


wlo_female_top100 <- 
  wlo_female %>% 
  mutate(
    or = exp(logratio)
  ) %>% 
  filter(log_odds_weighted > 3.891, n > 99) %>% 
  top_n(100, or) %>%
  arrange(-1*or, word) 

wlo_male_top100 <- 
  wlo_male %>% 
  mutate(
    or = exp(logratio)
  ) %>% 
  filter(log_odds_weighted > 3.891, n > 99) %>% 
  top_n(100, or) %>%
  arrange(-1*or, word) 

####################  S-5 and S-6   ###################

write_csv(wlo_female_top100, "../wlo_female_top100.csv")
write_csv(wlo_male_top100, "../wlo_male_top100.csv")

## We did further formatting in Excel

### Now read in the clinical trial prediction results for patent text
# Load the female trial predictions in
pat_trial_female <- 
  read_delim("clean_data/patent_data/pat_predicted_female_trial.csv", 
             " ", col_names=c("label","estimate")) %>% 
  mutate(
    estimate = ifelse(label == "__label__All", 1 - estimate, estimate)
  ) %>% 
  select(
    estimate
  ) %>% 
  rename(
    female_trial_est = estimate
  ) %>% 
  mutate(
    patent_id = as.character(pat_text$patent_id), 
  )

# Load the male trial predictions in
pat_trial_male <- 
  read_delim("clean_data/patent_data/pat_predicted_male_trial.csv", 
             " ", col_names=c("label","estimate")) %>% 
  mutate(
    estimate = ifelse(label == "__label__All", 1 - estimate, estimate)
  ) %>% 
  select(
    estimate
  ) %>% 
  rename(
    male_trial_est = estimate
  ) %>% 
  mutate(
    patent_id = as.character(pat_text$patent_id)
  )

## Build patent level data with trial predictions
pat_trial_data <- 
  pat_reg_data %>% 
  left_join(
    pat_trial_female
  ) %>% 
  left_join(
    pat_trial_male
  ) %>% 
  left_join(
    pat_text %>% 
      select(patent_id, title) %>% 
      mutate(patent_id = as.character(pat_text$patent_id))
  )

## Build patent with mesh FE data with trial predictions
pat_mesh_trial_data <- 
  pat_mesh_reg_data %>% 
  left_join(
    pat_trial_female
  ) %>% 
  left_join(
    pat_trial_male
  ) %>% 
  left_join(
    pat_text %>% 
      select(patent_id, title) %>% 
      mutate(patent_id = as.character(pat_text$patent_id))
  ) %>% 
  mutate(
    ptf100 = female_trial_est*100
  ) %>% 
  filter(
    !is.na(ptf100)
  )

####################  S-9 and S-10   ###################

### TRY DECILES

set.seed(90403)
rows <- sample(nrow( pat_trial_data %>% filter(!is.na(male_trial_est), !is.na(female_trial_est))))
example_trial_pred_female <- 
  pat_trial_data %>% 
  filter(!is.na(male_trial_est), !is.na(female_trial_est)) %>% 
  left_join(
    pat_text %>% 
      mutate(patent_id = as.character(patent_id)) %>% 
      select(patent_id, title, text_2500)
  ) %>% 
  mutate(
    rows = rows,
    female_bucket = ceiling(round(female_trial_est,2)*10),
    female_bucket = ifelse(female_bucket == 0, 1, female_bucket)
  ) %>% 
  arrange(female_bucket, rows) %>% 
  group_by(female_bucket) %>% 
  filter(
    row_number() < 6
  ) %>% 
  select(patent_id, female_trial_est, male_trial_est, title, text_2500)

write_csv(example_trial_pred_female, 
          "../pat_trial_predictions_female10.csv")

rows <- sample(nrow( pat_trial_data %>% filter(!is.na(male_trial_est), !is.na(female_trial_est))))
example_trial_pred_male <- 
  pat_trial_data %>% 
  filter(!is.na(male_trial_est), !is.na(female_trial_est)) %>% 
  left_join(
    pat_text %>% 
      mutate(patent_id = as.character(patent_id)) %>% 
      select(patent_id, title, text_2500)
  ) %>% 
  mutate(
    rows = rows,
    male_bucket = ceiling(round(male_trial_est,2)*10),
    male_bucket = ifelse(male_bucket == 0, 1, male_bucket)
  ) %>% 
  arrange(male_bucket, rows) %>% 
  group_by(male_bucket) %>% 
  filter(
    row_number() < 6
  ) %>% 
  select(patent_id, female_trial_est, male_trial_est, title, text_2500)

write_csv(example_trial_pred_male, 
          "../pat_trial_predictions_male10.csv")

## We did further formatting in Excel ##

### CHECK TAGS PREDICT FEMALE TRIAL ESTIMATE 
### Primary results hold with our two other measures
m1 <- feols(log_if ~ min_f + maj_f + all_f + log_im | yr^sc + yr^sz, 
            data = pat_reg_data_gbd, weights = ~weights)

m3 <- feols(100*female_trial_est ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
            pat_trial_data)

m4 <- feols(100*(female_trial_est > 0.8) ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
            pat_trial_data)

m5 <- feols(log_im ~ min_f + maj_f + all_f + log_if | yr^sc + yr^sz, 
            data = pat_reg_data_gbd, weights = ~weights)

m7 <- feols(100*male_trial_est ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
            pat_trial_data)

m8 <- feols(100*(male_trial_est > 0.8) ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
            pat_trial_data)

####################  S-24   ###################

etable(m1, m3, m4, m5, m7, m8,
       digits = 2, cluster="patent_id")


### Main results replication: with predicted gender focus as the DV
# No disease FEs
m_all <- feols(100*female_trial_est ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
               pat_trial_data)

m_decade1 <- feols(100*female_trial_est ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                   pat_trial_data %>% filter(patent_year < 1990))

m_decade2 <- feols(100*female_trial_est ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                   pat_trial_data %>% filter(patent_year > 1989 & patent_year < 2000))

m_decade3 <- feols(100*female_trial_est ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                   pat_trial_data %>% filter(patent_year > 1999))

m_drugs <- feols(100*female_trial_est ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                 pat_trial_data %>% filter(subcategory_id == 31))

m_surgery <- feols(100*female_trial_est ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                   pat_trial_data %>% filter(subcategory_id == 32))

m_uni <- feols(100*female_trial_est ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
               pat_trial_data %>% filter(asg_company != 1))

m_corp <- feols(100*female_trial_est ~ min_f + maj_f + all_f | yr^sc + yr^sz, 
                pat_trial_data %>% filter(asg_company == 1))


####################  S-25   ###################

etable(m_all, m_decade1, m_decade2, m_decade3, m_drugs, m_surgery, m_uni, m_corp,
       cluster="patent_id")

### Replications continued: matching
#All 4
match_teams <- function(x, data) {
  matchme <- 
    data %>% 
    mutate(treat = !!sym(x)) %>% 
    filter(all_male | !!sym(x), !is.na(m_1)) %>% 
    dplyr::select(patent_id, ptf100, treat, !!sym(x), yr, sc, sz, m_1)
  
  mout <- matchit(treat ~ yr + sc + m_1 + sz, 
                  data=matchme, 
                  method="exact")
  
  mdata <- match.data(mout) %>% as_tibble()
  mdata
}

dm_all_all_f <- match_teams('all_f', pat_mesh_trial_data)
dm_decade1_all_f <- match_teams('all_f', pat_mesh_trial_data %>% filter(patent_year < 1990))
dm_decade2_all_f <- match_teams('all_f', pat_mesh_trial_data %>% filter(patent_year > 1989 & patent_year < 2000))
dm_decade3_all_f <- match_teams('all_f', pat_mesh_trial_data %>% filter(patent_year > 1999))
dm_drugs_all_f <- match_teams('all_f', pat_mesh_trial_data %>% filter(subcategory_id == 31))
dm_surgery_all_f <- match_teams('all_f', pat_mesh_trial_data %>% filter(subcategory_id == 32))
dm_uni_all_f <- match_teams('all_f', pat_mesh_trial_data %>% filter(asg_company != 1))
dm_corp_all_f <- match_teams('all_f', pat_mesh_trial_data %>% filter(asg_company == 1))

dm_all_maj_f <- match_teams('maj_f', pat_mesh_trial_data)
dm_decade1_maj_f <- match_teams('maj_f', pat_mesh_trial_data %>% filter(patent_year < 1990))
dm_decade2_maj_f <- match_teams('maj_f', pat_mesh_trial_data %>% filter(patent_year > 1989 & patent_year < 2000))
dm_decade3_maj_f <- match_teams('maj_f', pat_mesh_trial_data %>% filter(patent_year > 1999))
dm_drugs_maj_f <- match_teams('maj_f', pat_mesh_trial_data %>% filter(subcategory_id == 31))
dm_surgery_maj_f <- match_teams('maj_f', pat_mesh_trial_data %>% filter(subcategory_id == 32))
dm_uni_maj_f <- match_teams('maj_f', pat_mesh_trial_data %>% filter(asg_company != 1))
dm_corp_maj_f <- match_teams('maj_f', pat_mesh_trial_data %>% filter(asg_company == 1))

dm_all_min_f <- match_teams('min_f', pat_mesh_trial_data)
dm_decade1_min_f <- match_teams('min_f', pat_mesh_trial_data %>% filter(patent_year < 1990))
dm_decade2_min_f <- match_teams('min_f', pat_mesh_trial_data %>% filter(patent_year > 1989 & patent_year < 2000))
dm_decade3_min_f <- match_teams('min_f', pat_mesh_trial_data %>% filter(patent_year < 1999))
dm_drugs_min_f <- match_teams('min_f', pat_mesh_trial_data %>% filter(subcategory_id == 31))
dm_surgery_min_f <- match_teams('min_f', pat_mesh_trial_data %>% filter(subcategory_id == 32))
dm_uni_min_f <- match_teams('min_f', pat_mesh_trial_data %>% filter(asg_company != 1))
dm_corp_min_f <- match_teams('min_f', pat_mesh_trial_data %>% filter(asg_company == 1))


mm_all_all_f <- feols(ptf100 ~ all_f | subclass, dm_all_all_f, weights = ~weights)
mm_decade1_all_f <- feols(ptf100 ~ all_f | subclass, dm_decade1_all_f, weights = ~weights)
mm_decade2_all_f <- feols(ptf100 ~ all_f | subclass, dm_decade2_all_f, weights = ~weights)
mm_decade3_all_f <- feols(ptf100 ~ all_f | subclass, dm_decade3_all_f, weights = ~weights)
mm_drugs_all_f <- feols(ptf100 ~ all_f | subclass, dm_drugs_all_f, weights = ~weights)
mm_surgery_all_f <- feols(ptf100 ~ all_f | subclass, dm_surgery_all_f, weights = ~weights)
mm_uni_all_f <- feols(ptf100 ~ all_f | subclass, dm_uni_all_f, weights = ~weights)
mm_corp_all_f <- feols(ptf100 ~ all_f | subclass, dm_corp_all_f, weights = ~weights)

mm_all_maj_f <- feols(ptf100 ~ maj_f | subclass, dm_all_maj_f, weights = ~weights)
mm_decade1_maj_f <- feols(ptf100 ~ maj_f | subclass, dm_decade1_maj_f, weights = ~weights)
mm_decade2_maj_f <- feols(ptf100 ~ maj_f | subclass, dm_decade2_maj_f, weights = ~weights)
mm_decade3_maj_f <- feols(ptf100 ~ maj_f | subclass, dm_decade3_maj_f, weights = ~weights)
mm_drugs_maj_f <- feols(ptf100 ~ maj_f | subclass, dm_drugs_maj_f, weights = ~weights)
mm_surgery_maj_f <- feols(ptf100 ~ maj_f | subclass, dm_surgery_maj_f, weights = ~weights)
mm_uni_maj_f <- feols(ptf100 ~ maj_f | subclass, dm_uni_maj_f, weights = ~weights)
mm_corp_maj_f <- feols(ptf100 ~ maj_f | subclass, dm_corp_maj_f, weights = ~weights)

mm_all_min_f <- feols(ptf100 ~ min_f | subclass, dm_all_min_f, weights = ~weights)
mm_decade1_min_f <- feols(ptf100 ~ min_f | subclass, dm_decade1_min_f, weights = ~weights)
mm_decade2_min_f <- feols(ptf100 ~ min_f | subclass, dm_decade2_min_f, weights = ~weights)
mm_decade3_min_f <- feols(ptf100 ~ min_f | subclass, dm_decade3_min_f, weights = ~weights)
mm_drugs_min_f <- feols(ptf100 ~ min_f | subclass, dm_drugs_min_f, weights = ~weights)
mm_surgery_min_f <- feols(ptf100 ~ min_f | subclass, dm_surgery_min_f, weights = ~weights)
mm_uni_min_f <- feols(ptf100 ~ min_f | subclass, dm_uni_min_f, weights = ~weights)
mm_corp_min_f <- feols(ptf100 ~ min_f | subclass, dm_corp_min_f, weights = ~weights)


####################  S-26   ###################

etable(
  mm_all_min_f,
  mm_decade1_min_f,
  mm_decade2_min_f,
  mm_decade3_min_f, 
  mm_drugs_min_f,
  mm_surgery_min_f,
  mm_uni_min_f,
  mm_corp_min_f,
  cluster="patent_id")

####################  S-27   ###################

etable(
  mm_all_maj_f,
  mm_decade1_maj_f,
  mm_decade2_maj_f,
  mm_decade3_maj_f, 
  mm_drugs_maj_f,
  mm_surgery_maj_f,
  mm_uni_maj_f,
  mm_corp_maj_f,
  cluster="patent_id")

####################  S-28   ###################

etable(
  mm_all_all_f, 
  mm_decade1_all_f, 
  mm_decade2_all_f, 
  mm_decade3_all_f, 
  mm_drugs_all_f,
  mm_surgery_all_f,
  mm_uni_all_f,
  mm_corp_all_f,
  cluster="patent_id")

################################################################################
###                              Lasso results
################################################################################

## Note: To prepare the data for and run the lasso models, 
## at least 300 GB RAM is required

library(hdm)

# There are 4,000 unique disease MeSH terms
pat_disease_fe_wide <- 
  patent_mesh_and_tree %>%  
  filter(mesh_tree_num != "", str_sub(mesh_tree_num,1,1) == "C") %>% 
  group_by(patent_id, mesh_term) %>% 
  slice_head(n=1) %>% 
  ungroup() %>% 
  mutate(
    mesh_rank = patent_mesh_rank
  ) %>% 
  select(patent_id, mesh_term, mesh_rank, patent_mesh_score) %>% 
  arrange(patent_id, mesh_rank) %>% 
  group_by(patent_id) %>% 
  group_by(mesh_term) %>% 
  mutate(
    mesh_count = n(),
    mesh_dummy = 1,
    patent_mesh_score = log(patent_mesh_score)
  ) %>% 
  ungroup() %>% 
  select(patent_id, mesh_term, patent_mesh_score) %>% 
  pivot_wider(names_from = mesh_term, values_from = patent_mesh_score, values_fill = 0)


# customized merge function to merge the wide mesh term variables to pat_reg_data
left_join0 <- function(x, y, fill = 0L){
  z <- left_join(x, y)
  tmp <- setdiff(names(z), names(x))
  z <- replace_na(z, setNames(as.list(rep(fill, length(tmp))), tmp))
  z
}

# Set the seed
set.seed(02139)

##  2000s

# do this regression first because we always want year X subcategory 
# and year X team size in the model
m_resid <- feols(pf100 ~ 1 | yr^sc + yr^sz, pat_reg_data %>% filter(yr > 1999))
pf100_resid <- m_resid$residuals

# add the very wide mesh term structure to the main data with residual
pat_reg_data_hd <- 
  pat_reg_data %>% 
  filter(yr > 1999) %>% 
  select(patent_id, all_f, maj_f, min_f, yr) %>% 
  mutate(
    pf100_resid = pf100_resid
  ) %>% 
  left_join0(
    pat_disease_fe_wide
  ) %>% 
  select(-patent_id)


# run the double selection process
y <- pat_reg_data_hd %>% select(pf100_resid)
X <- pat_reg_data_hd %>% select(-pf100_resid, -yr) %>% as.matrix()
lasso_all <- rlassoEffects(x=X, y=y, index=1:3, method = "double selection")
summary(lasso_all)

# this is the model results file
saveRDS(lasso_all, "lasso_decade_2000s.rds")


##  1990s

m_resid <- feols(pf100 ~ 1 | yr^sc + yr^sz, pat_reg_data %>% filter(yr > 1989, yr < 2000))
pf100_resid <- m_resid$residuals

pat_reg_data_hd <- 
  pat_reg_data %>% 
  filter(yr > 1989, yr < 2000) %>% 
  select(patent_id, all_f, maj_f, min_f, yr) %>% 
  mutate(
    pf100_resid = pf100_resid
  ) %>% 
  left_join0(
    pat_disease_fe_wide
  ) %>% 
  select(-patent_id)

y <- pat_reg_data_hd %>% select(pf100_resid)
X <- pat_reg_data_hd %>% select(-pf100_resid, -yr) %>% as.matrix()
lasso_all <- rlassoEffects(x=X, y=y, index=1:3, method = "double selection")
summary(lasso_all)
saveRDS(lasso_all, "lasso_decade_1990s.rds")

##  1980s

m_resid <- feols(pf100 ~ 1 | yr^sc + yr^sz, pat_reg_data %>% filter(yr < 1990))
pf100_resid <- m_resid$residuals

pat_reg_data_hd <- 
  pat_reg_data %>% 
  filter(yr < 1990) %>% 
  select(patent_id, all_f, maj_f, min_f, yr) %>% 
  mutate(
    pf100_resid = pf100_resid
  ) %>% 
  left_join0(
    pat_disease_fe_wide
  ) %>% 
  select(-patent_id)

y <- pat_reg_data_hd %>% select(pf100_resid)
X <- pat_reg_data_hd %>% select(-pf100_resid, -yr) %>% as.matrix()
lasso_all <- rlassoEffects(x=X, y=y, index=1:3, method = "double selection")
summary(lasso_all)
saveRDS(lasso_all, "lasso_decade_1980s.rds")

### getting the model results directly from those rds files

#read rds files
for (i in c(1980, 1990, 2000)) {
  file_name <- str_c("lasso_decade_", i, "s.rds")
  file <- readRDS(file_name)
  assign(paste0("lasso_", i, "s"), file)
  rm(file_name, file)
}

####################  S-31   ###################

summary(lasso_1980s)
summary(lasso_1990s)
summary(lasso_2000s)


################################################################################
###                     Publication-level regressions
################################################################################

library(data.table)
library(dplyr)
library(tidyr)
library(stringr)
library(fixest)
library(MatchIt)
library(modelsummary)

### Publication data summary statistics

dat_sum <- fread("clean_data/article_data/pubmed2002_reg_meshC_types.csv")

dat_sum <- dat_sum[!is.na(binary_female)] %>% as_tibble() %>%
  filter(PMID %in% dat_1k$PMID)

####################  S-49   ###################

datasummary((`Team Size` = teamsize) + (`Year` = year) + (`Female Mesh` = binary_female) + (`Male Mesh` = binary_male) + 
              (`Minority Female Team` = min_f) + (`Majority Female Team` = maj_f) + (`All Female Team` = all_f) ~ Mean +
              Median + (Std.Dev. = SD) + Max + Min + (N = 1), data = dat_sum %>%
              mutate(
                min_f = as.numeric(min_f), 
                maj_f = as.numeric(maj_f), 
                all_f = as.numeric(all_f) 
              ))

####################  S-50   ###################

datasummary((`Team Size` = teamsize) + (`Year` = year) + (`Female Mesh` = binary_female) + (`Male Mesh` = binary_male) + 
              (`Minority Female Team` = min_f) + (`Majority Female Team` = maj_f) + (`All Female Team` = all_f) ~ Mean +
              Median + (Std.Dev. = SD) + Max + Min + (N = 1), data = dat %>%
              mutate(
                min_f = as.numeric(min_f), 
                maj_f = as.numeric(maj_f), 
                all_f = as.numeric(all_f) 
              ) %>% 
              filter(year > 2001 & year <= 2010))

### Main regressions

## load the full sample of research articles
dat <- fread("clean_data/article_data/pubmed2002_reg_meshC_types.csv")

# formatting the variables
dat$f100 <- dat$binary_female * 100
dat$m100 <- dat$binary_male * 100
dat$year <- as.numeric(as.factor(dat$year))
dat$teamsize <- as.numeric(as.factor(dat$teamsize))
dat$journal_id <- as.numeric(as.factor(dat$journal_id))
dat$all_f <- as.logical(dat$all_f)
dat$all_m <- as.logical(dat$all_m)
dat$min_f <- as.logical(dat$min_f)
dat$maj_f <- as.logical(dat$maj_f)

dat <- as_tibble(dat)


## load the cleaned JCIF 1k, 500, and 100 datasets
dat_1k <- fread("clean_data/article_data/jcif_1k_regdat.csv") %>% as_tibble()
dat_100 <- fread("clean_data/article_data/jcif_100_regdat.csv") %>% as_tibble()
dat_500 <- fread("clean_data/article_data/jcif_500_regdat.csv") %>% as_tibble()

### Regressions

# Baseline regressions using the full, top 1,000, top 500, and top 100 JCIF samples
# teamsize & journal
fe2_100 <- feols(f100 ~ min_f + maj_f + all_f | year^teamsize + year^journal_id, dat_100)
fe2_500 <- feols(f100 ~ min_f + maj_f + all_f | year^teamsize + year^journal_id, dat_500)
fe2_1k <- feols(f100 ~ min_f + maj_f + all_f | year^teamsize + year^journal_id, dat_1k)
fe2 <- feols(f100 ~ min_f + maj_f + all_f | year^teamsize + year^journal_id, dat)

# teamsize & journal & m1
fe3_100 <- feols(f100 ~ min_f + maj_f + all_f | year^teamsize + year^journal_id + year^m_1, 
                 dat_100 %>% filter(!is.na(m_1)))
fe3_500 <- feols(f100 ~ min_f + maj_f + all_f | year^teamsize + year^journal_id + year^m_1, 
                 dat_500 %>% filter(!is.na(m_1)))
fe3_1k <- feols(f100 ~ min_f + maj_f + all_f | year^teamsize + year^journal_id + year^m_1, 
                dat_1k %>% filter(!is.na(m_1)))
fe3 <- feols(f100 ~ min_f + maj_f + all_f | year^teamsize + year^journal_id + year^m_1, 
             dat %>% filter(!is.na(m_1)))

# Matching regressions 
# (the process of getting the year-team size- diseas matched data can be found in analysis_MainPaper.R)

## read the data:
#year-team-mesh/disease
dm_all_f <- as_tibble(fread("clean_data/article_data/dm_all_f_1k.csv")) %>%
  left_join(select(dat, PMID, m100), by = "PMID")

dm_maj_f <- as_tibble(fread("clean_data/article_data/dm_maj_f_1k.csv")) %>%
  left_join(select(dat, PMID, m100), by = "PMID")

dm_min_f <- as_tibble(fread("clean_data/article_data/dm_min_f_1k.csv")) %>%
  left_join(select(dat, PMID, m100), by = "PMID")

#year-team-journal
# matching based on Year-Team Size-Journal
match_teams <- function(x, data) {
  matchme <- 
    data %>% 
    mutate(treat = !!sym(x)) %>% 
    filter(all_m | !!sym(x), !is.na(m_1)) %>% 
    dplyr::select(PMID, f100, treat, !!sym(x), year, teamsize, journal_id, m_1)
  
  mout <- matchit(treat ~ year + journal_id + teamsize, 
                  data=matchme, 
                  method="exact")
  
  mdata <- match.data(mout) %>% as_tibble()
  mdata
}

# prepare the input data
dat_formatch <- dat_1k %>% as.data.table()

dat_formatch <- dat_formatch[!is.na(f100)]
dat_formatch <- as_tibble(dat_formatch)

# matched datasets based on year-team-journal
dmm_all_f <- match_teams('all_f', dat_formatch) %>%
  as_tibble() %>%
  rename(subclass_jn = subclass) %>%
  left_join(select(dat, PMID, m100, f100), by = "PMID")

dmm_maj_f <- match_teams('maj_f', dat_formatch) %>%
  as_tibble() %>%
  rename(subclass_jn = subclass) %>%
  left_join(select(dat, PMID, m100, f100), by = "PMID")

dmm_min_f <- match_teams('min_f', dat_formatch) %>%
  as_tibble() %>%
  rename(subclass_jn = subclass) %>%
  left_join(select(dat, PMID, m100, f100), by = "PMID")

#dm models
mm_all_f <- feols(f100 ~ all_f | year^journal_id + subclass, dm_all_f, weights = ~weights)
mm_maj_f <- feols(f100 ~ maj_f | year^journal_id + subclass, dm_maj_f, weights = ~weights)
mm_min_f <- feols(f100 ~ min_f | year^journal_id + subclass, dm_min_f, weights = ~weights)


#dmm models
#with yearXteamsize only
mmm_all_f <- feols(f100 ~ all_f | subclass_jn, dmm_all_f, weights = ~weights)
mmm_maj_f <- feols(f100 ~ maj_f | subclass_jn, dmm_maj_f, weights = ~weights)
mmm_min_f <- feols(f100 ~ min_f | subclass_jn, dmm_min_f, weights = ~weights)

#plus yearXm1

#add m1 back to dmm data
dat_forjoin <- 
  dat %>%
  dplyr::select(PMID, m_1)

dmm_all_f_m1 <- 
  dmm_all_f %>%
  left_join(dat_forjoin, by = "PMID") %>%
  filter(!is.na(m_1))

dmm_min_f_m1 <- 
  dmm_min_f %>%
  left_join(dat_forjoin, by = "PMID") %>%
  filter(!is.na(m_1))

dmm_maj_f_m1 <- 
  dmm_maj_f %>% 
  left_join(dat_forjoin, by = "PMID") %>%
  filter(!is.na(m_1)) 

mmm_all_f_m1 <- feols(f100 ~ all_f | year^m_1 + subclass_jn, dmm_all_f_m1, weights = ~weights)
mmm_maj_f_m1 <- feols(f100 ~ maj_f | year^m_1 + subclass_jn, dmm_maj_f_m1, weights = ~weights)
mmm_min_f_m1 <- feols(f100 ~ min_f | year^m_1 + subclass_jn, dmm_min_f_m1, weights = ~weights)

####################  S-40   ###################
etable(fe2, fe3, fe2_1k, fe3_1k, fe2_500, fe3_500, fe2_100, fe3_100, 
       cluster="PMID")

####################  S-32   ###################
etable(fe2_1k, mm_min_f, mm_maj_f, mm_all_f,
       cluster="PMID")

####################  S-41   ###################
etable(mmm_min_f, mmm_maj_f, mmm_all_f, mmm_min_f_m1, mmm_maj_f_m1, mmm_all_f_m1,
       cluster="PMID")

## now build the male version of figure 3
fe2_1k_m <- feols(m100 ~ min_f + maj_f + all_f | year^teamsize + year^journal_id, dat_1k)
mm_all_f_m <- feols(m100 ~ all_f | year^journal_id + subclass, dm_all_f, weights = ~weights)
mm_maj_f_m <- feols(m100 ~ maj_f | year^journal_id + subclass, dm_maj_f, weights = ~weights)
mm_min_f_m <- feols(m100 ~ min_f | year^journal_id + subclass, dm_min_f, weights = ~weights)

####################  S-39   ###################
etable(fe2_1k_m, mm_min_f_m, mm_maj_f_m, mm_all_f_m,   
       cluster="PMID")

### Analysis relates to disease gender incidence and paper gender focus prediction
## Load the cleaned prediction data (cleaning process can be found in pubmed_clean.R)
dat_pred <- fread("clean_data/article_data/dat_pred.csv") %>% as_tibble()


## Regressions
# mesh level
m_trial_f1 <- 
  feols(100*female_trial_est ~ binary_female, 
        data = dat_pred)

m_trial_f2 <- 
  feols(100*female_trial_est ~ binary_female + binary_male, 
        data = dat_pred)

m_trial_f3 <- 
  feols(100*(female_trial_est > 0.8) ~ binary_female + binary_male, 
        data = dat_pred)

m_trial_m1 <- 
  feols(100*male_trial_est ~ binary_male, 
        data = dat_pred)

m_trial_m2 <- 
  feols(100*male_trial_est ~ binary_male + binary_female, 
        data = dat_pred)

m_trial_m3 <- 
  feols(100*(male_trial_est > 0.8) ~ binary_female + binary_male, 
        data = dat_pred)

####################  S-38   ###################
etable(m_trial_f1, m_trial_f2, m_trial_f3,
       m_trial_m1, m_trial_m2, m_trial_m3,
       cluster="PMID")

# team gender level
tm_f1 <- feols(100*female_trial_est ~ min_f + maj_f + all_f | year^teamsize + year^journal_id, 
               dat_pred)

tm_f2 <- feols(100*(female_trial_est > 0.8) ~ min_f + maj_f + all_f | year^teamsize + year^journal_id, 
               dat_pred)

tm_m1 <- feols(100*male_trial_est ~ min_f + maj_f + all_f | year^teamsize + year^journal_id, 
               dat_pred)

tm_m2 <- feols(100*(male_trial_est > 0.8) ~ min_f + maj_f + all_f | year^teamsize + year^journal_id, 
               dat_pred)


####################  S-36 and S-37   ###################
### title examples based on prediction probabilities
set.seed(90403)
rows <- sample(nrow(dat_pred %>% filter(PMID %in% dat_1k$PMID, !is.na(male_trial_est), !is.na(female_trial_est))))
example_trial_pred_female <- 
  dat_pred %>% 
  filter(PMID %in% dat_1k$PMID, !is.na(male_trial_est), !is.na(female_trial_est)) %>% 
  mutate(
    rows = rows,
    female_bucket = ceiling(round(female_trial_est,2)*10),
    female_bucket = ifelse(female_bucket == 0, 1, female_bucket)
  ) %>% 
  arrange(female_bucket, rows) %>% 
  group_by(female_bucket) %>% 
  filter(
    row_number() < 6
  ) %>% 
  select(PMID, female_trial_est, male_trial_est, title, abstract)

write_csv(example_trial_pred_female, 
          "pubmed_trial_predictions_female10.csv")

rows <- sample(nrow(dat_pred %>% filter(PMID %in% dat_1k$PMID, !is.na(male_trial_est), !is.na(female_trial_est))))
example_trial_pred_male <- 
  dat_pred %>% 
  filter(PMID %in% dat_1k$PMID, !is.na(male_trial_est), !is.na(female_trial_est)) %>% 
  mutate(
    rows = rows,
    male_bucket = ceiling(round(male_trial_est,2)*10),
    male_bucket = ifelse(male_bucket == 0, 1, male_bucket)
  ) %>% 
  arrange(male_bucket, rows) %>% 
  group_by(male_bucket) %>% 
  filter(
    row_number() < 6
  ) %>% 
  select(PMID, female_trial_est, male_trial_est, title, abstract)

write_csv(example_trial_pred_male, 
          "pubmed_trial_predictions_male10.csv")

##
# We write out the CSV files and do further formatting in Excel 
##

####################  S-33 and S-34   ###################
### Example gender oriented words based on odds ratio and Z-Score
library(tidylo)
library(tidytext)

## all based on top 1k jcif articles
tidy_abstract <- 
  dat_pred %>% 
  select(PMID, binary_female, binary_male, abstract) %>% 
  filter(
    PMID %in% dat_1k$PMID,
    !is.na(binary_female), 
    !is.na(binary_male)
  ) %>% 
  mutate(abstract = abstract %>%
           str_to_lower() %>%
           str_replace_all("\\s+", " ") %>%
           str_trim()) %>%
  unnest_tokens(word, abstract) %>%
  filter(!word %in% stop_words$word,
         !word %in% str_remove_all(stop_words$word, "'"))

raw_logratio_female <- 
  tidy_abstract %>% 
  mutate(
    isFemale = ifelse(binary_female == 1,  "Female", "NotFemale")
  ) %>% 
  count(isFemale, word, sort = TRUE) %>% 
  spread(isFemale, n, fill = 0) %>%
  mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
  mutate(logratio = log(Female / NotFemale)) %>%
  arrange(desc(logratio))

wlo_female <- 
  tidy_abstract %>% 
  count(binary_female, word, sort = TRUE) %>% 
  bind_log_odds(binary_female, word, n, unweighted=TRUE) %>% 
  filter(binary_female == 1) %>% 
  left_join(
    raw_logratio_female %>% select(word, logratio)
  ) %>%
  filter(log_odds_weighted > 3.891, n > 99)


raw_logratio_male <- 
  tidy_abstract %>% 
  mutate(
    isMale = ifelse(binary_male == 1,  "Male", "NotMale")
  ) %>% 
  count(isMale, word, sort = TRUE) %>% 
  spread(isMale, n, fill = 0) %>%
  mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
  mutate(logratio = log(Male / NotMale)) %>%
  arrange(desc(logratio))


wlo_male <- 
  tidy_abstract %>% 
  count(binary_male, word, sort = TRUE) %>% 
  bind_log_odds(binary_male, word, n, unweighted=TRUE) %>% 
  filter(binary_male == 1) %>% 
  left_join(
    raw_logratio_male %>% select(word, logratio)
  ) %>%
  filter(log_odds_weighted > 3.891, n > 99)

# top 100 words with at least 10 z-score and by odds ratio
wlo_female_highor <- 
  wlo_female %>% 
  filter(log_odds_weighted > 10) %>%
  mutate(
    or = exp(logratio)
  ) %>%
  top_n(100, or) %>%
  arrange(-1*or, word) %>%
  mutate(log_odds_weighted = round(log_odds_weighted), 
         or = round(or, digits = 1))

# top 100 words with at least 10 z-score and by odds ratio
wlo_male_highor <- 
  wlo_male %>% 
  filter(log_odds_weighted > 10) %>%
  mutate(
    or = exp(logratio)
  ) %>%
  top_n(100, or) %>%
  arrange(-1*or, word) %>%
  mutate(log_odds_weighted = round(log_odds_weighted), 
         or = round(or, digits = 1))

# write out the top-100 samples
write_csv(wlo_female_highor, "wlo_female_top100.csv")
write_csv(wlo_male_highor, "wlo_male_top100.csv")

##
# We write out the CSV files and do further formatting in Excel 
##

### Prediction replication of figure 3 analysis
## general regression
fe2_1k <- 
  feols(100*female_trial_est ~ min_f + maj_f + all_f | year^teamsize + year^journal_id, dat_pred)

## matching part
#data:
#year-teamsize-mesh data for prediction
dm_all_f_pred <- dm_all_f %>%
  left_join(select(dat_pred, PMID, female_trial_est), by = "PMID")

dm_maj_f_pred <- dm_maj_f %>%
  left_join(select(dat_pred, PMID, female_trial_est), by = "PMID")

dm_min_f_pred <- dm_min_f %>%
  left_join(select(dat_pred, PMID, female_trial_est), by = "PMID")

#dm models
mm_all_f <- feols(100*female_trial_est ~ all_f | year^journal_id + subclass, dm_all_f_pred, weights = ~weights)
mm_maj_f <- feols(100*female_trial_est ~ maj_f | year^journal_id + subclass, dm_maj_f_pred, weights = ~weights)
mm_min_f <- feols(100*female_trial_est ~ min_f | year^journal_id + subclass, dm_min_f_pred, weights = ~weights)


####################  S-44   ###################
etable(fe2_1k, mm_min_f, mm_maj_f, mm_all_f,   
       cluster="PMID")


### Analysis for disease incidence of gender
## Load the cleaned incidence data (cleaning process can be found in pubmed_clean.R)
dat_pred <- frea("clean_data/article_data/incidence_reg1k.csv") %>% as_tibble()

## regressions
# mesh level only
m_validate_incidence_female1 <- 
  feols(log_if ~ binary_female + log_im,  
        data = incidence_reg1k, weights = ~weights)

m_validate_incidence_female2 <- 
  feols(log_if ~ binary_female + binary_male + log_im, 
        data = incidence_reg1k, weights = ~weights)

m_validate_incidence_male1 <- 
  feols(log_im ~ binary_male + log_if, 
        data = incidence_reg1k, weights = ~weights)

m_validate_incidence_male2 <- 
  feols(log_im ~ binary_female + binary_male + log_if, 
        data = incidence_reg1k, weights = ~weights)


####################  S-35   ###################
etable(m_validate_incidence_female1, m_validate_incidence_female2, 
       m_validate_incidence_male1, m_validate_incidence_male2,
       cluster="PMID")


# mesh with team genders
m_team_incf <- feols(log_if ~ min_f + maj_f + all_f + log_im | year^teamsize + year^journal_id, 
                     data = incidence_reg1k, weights = ~weights)

m_team_incm <- feols(log_im ~ min_f + maj_f + all_f + log_if | year^teamsize + year^journal_id, 
                     data = incidence_reg1k, weights = ~weights)


####################  S-43   ###################
etable(m_team_incf, tm_f1, tm_f2, m_team_incm, tm_m1, tm_m2, 
       cluster="PMID")

### Analysis with first and last author

dat_apdx <- dat_1k %>% as.data.table()

dat_apdx[first_author == 1, first_author:= 0]
dat_apdx[first_author == 2, first_author:= 1]
dat_apdx[last_author == 1, last_author:= 0]
dat_apdx[last_author == 2, last_author:= 1]

dat_apdx$min_first_int <- as.logical(as.numeric(dat_apdx$min_f) * dat_apdx$first_author)
dat_apdx$min_last_int <- as.logical(as.numeric(dat_apdx$min_f) * dat_apdx$last_author)
dat_apdx$maj_first_int <- as.logical(as.numeric(dat_apdx$maj_f) * dat_apdx$first_author)
dat_apdx$maj_last_int <- as.logical(as.numeric(dat_apdx$maj_f) * dat_apdx$last_author)

dat_apdx <- dat_apdx[!is.na(f100) & !is.na(m_1)]
dat_apdx <- as_tibble(dat_apdx)

fe_allyears <- feols(f100 ~ min_f + maj_f + all_f + min_first_int + min_last_int + 
                       maj_first_int + maj_last_int | year^journal_id + year^teamsize + year^m_1, dat_apdx)

fe_pre2012 <- feols(f100 ~ min_f + maj_f + all_f + min_first_int + min_last_int + 
                      maj_first_int + maj_last_int | year^journal_id + year^teamsize + year^m_1, 
                    dat_apdx %>% filter(year < 12))

fe_post2012 <- feols(f100 ~ min_f + maj_f + all_f + min_first_int + min_last_int + 
                       maj_first_int + maj_last_int | year^journal_id + year^teamsize + year^m_1, 
                     dat_apdx %>% filter(year > 11))

####################  S-42   ###################
etable(fe_allyears, fe_pre2012, fe_post2012, 
       cluster="PMID")